
##########################################################################
##### Prepare analysis with unsupervised techniques, EU speeches  ########
##########################################################################

# OVERVIEW, what we do here:
# I. Prepare meta data for stm topic modeling
# II. Load, check and sort supercorpus as per dictionary results
# III. Apply Denny and Spirling's diagnostics to find out about the effects of text cleaning

# Clean environment: remove the objects listed by ls() from the current
# workspace so the script starts from a blank slate.
rm(list = ls())

# Set working directory and put the "Data" folder and all files in "RData_for_directory" here!
# The empty string below is a placeholder: fill in the path of that directory
# before running the script. The relative paths used further down ("1Data",
# "model_dictionary.RData", "model_speaker.RData") are resolved against it.
setwd("")

# load packages
library(dplyr)
library(tidyverse)
library(tidytext)
library(quanteda)
library(stm)
library(stringr)
library(Rtsne)
library(rsvd)
library(geometry)
library(igraph)
library(stmCorrViz)
library(preText)

# Load every .RData file in the folder "1Data" and combine the corpora they
# contain into one "supercorpus". The loop reads an object called
# `speech_corpus` right after each load(), so every file must define it.
fileDir <- "1Data"
# `pattern` is a regular expression, not a shell glob: match the literal
# ".RData" extension at the end of the file name.
files <- list.files(fileDir, pattern = "\\.RData$", full.names = TRUE)

# Fail early with a readable message instead of an obscure load() error.
if (length(files) == 0) {
  stop("No .RData files found in folder '", fileDir, "'", call. = FALSE)
}

# The first corpus initialises supercorpus, every later one is appended.
# The original branches (`i < 41` / `i > 41`) silently skipped the 41st file;
# now every file after the first is added.
for (i in seq_along(files)) {
  load(files[i])
  if (i == 1) {
    supercorpus <- speech_corpus
  } else {
    if (i > 41) {
      # Legacy clean-up kept from the original script: drops the unnamed entry
      # of data$documents. `data` is not created in this script, so it is
      # presumably provided by the loaded file - TODO confirm it is still
      # needed. The original note asked to keep this threshold above the
      # current number of cases (38 at the time of writing).
      data$documents[[which(is.na(names(data$documents)))]] <- NULL
    }
    supercorpus <- supercorpus + speech_corpus
  }
}


#########################------------------###################################
#########################------------------###################################

# I. PREP: compile data set on meta data

# The key innovation of the stm package is that it allows metadata to be
# included in the analysis. Here we collect speaker, date and regime type per
# document (the dictionary results are merged in further down) and call the
# resulting data frame `meta`.
speaker <- docvars(supercorpus, "speaker")
date <- docvars(supercorpus, "date")
regime <- docvars(supercorpus, "regime")
# Document ids. docnames() yields the same ids as names(texts(supercorpus))
# without relying on texts(), which newer quanteda releases deprecate.
id <- docnames(supercorpus)
# id is very important here because meta and alldfm have to be sorted in same way (by id of texts)
# otherwise stm models confuse results!
# compile dataset with metadata
meta <- data.frame(speaker, date, regime, id)
# turn all columns into character vectors (no factors)
meta <- data.frame(lapply(meta, as.character), stringsAsFactors=FALSE)
# add a running index (= original corpus order) so that this order can be
# restored after merge(), which re-sorts rows by the key column
meta$index <- seq_len(nrow(meta))

# include another column with our dictionary results "scaleideo"
# (this is good for plotting later the correlation between
# topics and speakers with illiberal/liberal rhetoric to see what illiberal/liberal rhetoric is really about)
load("model_dictionary.RData")
# load also the overall scale scores per speaker
load("model_speaker.RData")
# The two load() calls are expected to provide the objects `model` and
# `export` used below.

# simplify model: expose the row names as an explicit id column ...
model$id <- rownames(model)
# ... and keep only columns 5 and 9. For the merge and the ideo coding below
# to work these must be "scaleideo" and the "id" column just added -
# TODO confirm the column positions against model_dictionary.RData.
dic <- model[, c(5,9)]

# ok, now we can merge without error
# all = TRUE keeps unmatched rows from both sides (filled with NA)
meta <- merge(meta, dic, by = "id", all = TRUE)
# now, importantly, restore the original corpus order via the index column
# (rows that exist only in dic have an NA index and end up last)
meta <- meta[order(meta$index),]

# lastly, create binary variable based on dictionary results:
# < 0 = illiberal, otherwise liberal (a score of exactly 0 is coded liberal,
# a missing score stays NA)
meta$ideo <- ifelse(meta$scaleideo < 0, "illiberal", "liberal")

# now add export model with scale sum to be sorted as per dictionary results
# first, rename columns in export
colnames(export)[colnames(export)=="scaleideo"] <- "scale_position"
colnames(export)[colnames(export)=="autodemo"] <- "speaker"

# inner join: documents whose speaker is missing from export are dropped
meta <- merge(meta, export, by = "speaker")
# sort by scale position, highest first
meta <- meta[order(-meta$scale_position),]
#meta <- meta[order(meta$index),]

# ok, export data for main stm script:
save(meta, file = "meta_stm.RData")


#################################################################

# II. sort supercorpus as per scale results and export files for later usage

#################################################################

# ok, let's now sort it as per index again to be merged with supercorpus data
merger <- meta[order(meta$index),]


# directly sorting the supercorpus is currently not possible,
# so, we collect all metadata in one data.frame
# and then add and sort it by scale_position and finally build the supercorpus again

# first, prepare texts with id to be merged with merger
# as.character() on a corpus returns the named character vector of documents;
# it replaces texts(), which newer quanteda releases deprecate.
# NOTE: the variable must keep the name `texts` - as.data.frame() below names
# the text column after it, and that column is renamed to "text" further down.
texts <- as.character(supercorpus)
# quick interactive checks: first document and all document ids
texts[1]
names(texts)
# one-column data frame with the document ids as row names
texts <- as.data.frame(texts, names(texts))
# make sure the text column is character (it is a factor on R < 4.0)
texts[1] <- lapply(texts[1], as.character)
# copy the row names into a second column (auto-named "V2") and call it "id"
texts[,2] <- rownames(texts)
colnames(texts)[colnames(texts)=="V2"] <- "id"
# now merge
preptexts <- merge(merger, texts, by = "id", all = TRUE)
# order now by index
preptexts <- preptexts[order(preptexts$index),]
# erase what we don't need: drops the 6th and 7th column by position
# (which columns these are depends on the layout of meta - TODO confirm)
preptexts <- preptexts[-c(6:7)]

# now get the other metadata (document variables of the current supercorpus)
title <- docvars(supercorpus, "title")
country <- docvars(supercorpus, "country")
# note: this variable masks base::source() by name for the rest of the session
source <- docvars(supercorpus, "source")
language <- docvars(supercorpus, "language")

# add them to preptexts as columns 8 to 11 (assigned by position, so they get
# the automatic names V8..V11 that are renamed below)
# NOTE(review): this assumes preptexts has exactly 7 columns at this point and
# the same number and order of rows as supercorpus - TODO confirm
preptexts[,8] <- title
preptexts[,9] <- country
preptexts[,10] <- source
preptexts[,11] <- language

# now, sort the entire thing as per scale_position (highest value first)
preptexts <- preptexts[order(-preptexts$scale_position),]

# now turn back into supercorpus
# first, rename columns as needed for this transformation:
# "texts"/"id" become the text and document-id fields named in corpus() below,
# V8..V11 get their descriptive names
colnames(preptexts)[colnames(preptexts)=="texts"] <- "text"
colnames(preptexts)[colnames(preptexts)=="id"] <- "doc_id"
colnames(preptexts)[colnames(preptexts)=="V8"] <- "title"
colnames(preptexts)[colnames(preptexts)=="V9"] <- "country"
colnames(preptexts)[colnames(preptexts)=="V10"] <- "source"
colnames(preptexts)[colnames(preptexts)=="V11"] <- "language"
# overwrite supercorpus with a corpus built from the sorted data frame
supercorpus <- corpus(preptexts, docid_field = "doc_id", text_field = "text")

# save this sorted supercorpus so we don't have to do this every time
save(supercorpus, file = "sorted_supercorpus.RData")


# check if all speakers (our cases!) are included in the sorted supercorpus:
# print the distinct speaker values for a visual check
speaker <- docvars(supercorpus, "speaker")
speaker_unique <- unique(speaker)
speaker_unique

# export this for main stm script
save(speaker_unique, file = "speaker_unique.RData")

# export also the vector with all documents for main stm script
# as.character() on a corpus returns the named character vector of documents;
# it replaces texts(), which newer quanteda releases deprecate.
findtexts <- as.character(supercorpus)
save(findtexts, file = "all_documents.RData")

#########################################

###### III. Testing the effects of preprocessing and cleaning  ######

#########################################

#########################---------------#####################################
#########################---------------#####################################


# apply Denny and Spirling's techniques (preText package) to make sure that our cleaning procedures do not have huge effects on results
# code adapted from: http://www.mjdenny.com/getting_started_with_preText.html

# basic idea: take a sample of around 500 texts from corpus and test all kind of preprocessing techniques
# compare the pairwise distance between documents in sample for each of the combinations of preprocessing techniques
# assess results: if there is huge difference, double check if this particular preprocessing feature stands on valid theoretical ground
# or should be spared due to its comparatively huge effect on the results

# first, take sample from speech corpus
# as.character() on a corpus returns the named character vector of documents;
# it replaces texts(), which newer quanteda releases deprecate.
sampling <- as.character(supercorpus)
# fix the random seed so the sample (and the diagnostics reported from it)
# can be reproduced; the value itself is arbitrary
set.seed(12345)
# sample without replacement; cap the sample size at the corpus size, because
# sample() raises an error when asked for more elements than are available
sample_speeches <- sample(sampling, min(500, length(sampling)),
                          replace = FALSE)

# we can now make use of the factorial_preprocessing() function,
# which will preprocess the data 64 or 128 different ways (depending on whether n-grams are included).
preprocessed_documents <- factorial_preprocessing(
  sample_speeches,
  use_ngrams = FALSE,
  infrequent_term_threshold = 0.2,
  verbose = FALSE)

# factorial_preprocessing() outputs a list object with three fields:
#   $choices  - a data.frame containing indicators for each of the
#               preprocessing steps used
#   $dfm_list - a list with 64 or 128 entries, each of which contains a
#               quanteda::dfm object preprocessed according to the
#               specification in the corresponding row of $choices
#   $labels   - the labels of the DFMs, which match the row names of $choices
# List the fields, then look at the first few rows of choices:
names(preprocessed_documents)

head(preprocessed_documents[["choices"]])

# With the preprocessed documents in hand, run the preText procedure on the
# factorial preprocessed corpus.
preText_results <- preText(
  preprocessed_documents,
  dataset_name = "EU Speeches",
  distance_method = "cosine",
  num_comparisons = 50,
  verbose = TRUE
)


# preText() returns a list of results with four fields:
#   preText_scores        - a data.frame containing preText scores and
#                           preprocessing step labels for each preprocessing
#                           step as columns (there is no preText score for the
#                           case of no preprocessing steps)
#   ranked_preText_scores - identical to preText_scores, but ordered by the
#                           magnitude of the preText score
#   choices               - a data.frame containing binary indicators of which
#                           preprocessing steps were applied to each factorial
#                           preprocessed DFM
#   regression_results    - a data.frame containing regression results where
#                           indicators for each preprocessing decision are
#                           regressed on the preText score for that
#                           specification

# Two plotting functions help to make sense of these results.
# preText_score_plot() creates a dot plot of scores for each preprocessing
# specification:
preText_score_plot(preText_results)

# In that plot the least risky specifications have the lowest preText score
# and are displayed at the top.
# We can also see the conditional effects of each preprocessing step on the
# mean preText score for each specification that included that step.
# Here again, a negative coefficient indicates that a step tends to reduce the
# unusualness of the results, while a positive coefficient indicates that
# applying the step is likely to produce more unusual results for that corpus.

# The coefficient plot gives an even better overview of the techniques which
# might influence the results the most.
regression_coefficient_plot(
  preText_results,
  remove_intercept = TRUE
)

# conclusions: our combination of preprocessing features is close to the mean of scores in the plot (even below), thus comparatively okay
# stopword removal is expected to have high effects on the results - yet, we still do this but make the list of stopwords transparent (see above)
# to show that this includes only word fragments or irrelevant terms

# overall, our preprocessing seems fine.

# report this in the Appendix and refer to it in the footnote which explains the cleaning procedures

############################################

#### IV. Apply unsupervised techniques ####

############################################

# see next script!
